pacman::p_load(readr,
dplyr,
tidyverse,
countrycode,
terra,
geodata,
purrr,
scales,
ggplot2,
plotly,
janitor,
lubridate)

Choose one of the datasets that you’ve worked with in the Lab or any of the other assignments (e.g., the astronauts dataset from Assignment 1).
Then, choose one or several of the packages presented at the workshop and use its functionality/tool to do something with the chosen dataset.
The output should involve at least two of the following: (a) a plot, (b) a table, and (c) modeling in some form. The format should be knitted Rmd to HTML and submitted to me via email by November 15, 6pm.
I will be using the data from the Stavnichuk and Corlett article, with the specific data set prepared by Georgios Karamanis, which contains publicly available information about all astronauts who participated in space missions before 15 January 2020, collected from NASA, Roscosmos, and fan-made websites. The provided information includes the full astronaut name, sex, date of birth, nationality, military status, the title and year of the selection program, and information about each mission completed by a particular astronaut, such as the year, the ascent and descent shuttle names, and the mission and extravehicular activity (EVA) durations.
As for the packages, I will be using Janitor, Lubridate, and Plotly. The goal of the task will be to clean the data set, create a variable of interest which is time-relevant, and create an interactive plot.
The first part of the task consisted of importing the data and preparing it for the plot. In this part, I also did an analysis of the variables, as demonstrated in the Janitor workshop.
# Import the astronauts data set from a CSV file (path is relative to the
# working directory).
astronauts <- read_csv("astronauts.csv")

# Clean the column names with janitor and drop rows and columns that are
# entirely empty. `which` is spelled out so that remove_empty() does not have
# to fall back to its default with a message.
astronauts_clean <- astronauts %>%
  clean_names() %>%
  remove_empty(which = c("rows", "cols"))
# Inspect the cleaned column names; the console output is kept below as
# comments.
print(colnames(astronauts_clean))
#>  [1] "id"                       "number"
#>  [3] "nationwide_number"        "name"
#>  [5] "original_name"            "sex"
#>  [7] "year_of_birth"            "nationality"
#>  [9] "military_civilian"        "selection"
#> [11] "year_of_selection"        "mission_number"
#> [13] "total_number_of_missions" "occupation"
#> [15] "year_of_mission"          "mission_title"
#> [17] "ascend_shuttle"           "in_orbit"
#> [19] "descend_shuttle"          "hours_mission"
#> [21] "field21"                  "eva_hrs_mission"
#> [23] "total_eva_hrs"
# Storing the date of the dataset (15 January 2020) as an object
date_of_dataset <- make_date(2020, 1, 15)

# Only the mission year is used, so 1 January of that year stands in for the
# mission date. Dividing the interval by years(1) expresses the elapsed time
# as a (fractional) number of years.
astronauts_clean_duration <- astronauts_clean %>%
  mutate(
    date_mission = make_date(year_of_mission, 1, 1),
    time_since_mission = interval(date_mission, date_of_dataset) / years(1)
  )

# With the clean data and the new variable, I then prepared a subset which
# summarizes the entire dataset to some variables of interest for the second
# part of the task.
# One row per nationality: the number of astronauts and the years elapsed
# since the most recent mission (the smallest elapsed time in the group).
# NOTE(review): the data appear to hold one row per astronaut and mission, so
# counting rows would count missions; distinct names are counted instead.
# Confirm that `name` identifies an astronaut uniquely.
astronauts_subset <- astronauts_clean_duration %>%
  group_by(nationality) %>%
  summarize(
    count_astronauts = n_distinct(name),
    last_mission = round(min(time_since_mission), 2)
  )

# Given that some of the country names used are outdated or misspelled, I
# corrected the names by using a data set from the geodata package. I also
# used this package to add international country codes, which will be
# necessary for the following step (which is plotting the countries and the
# variable chosen on a map).
# Country lookup table from the geodata package: keep only the ISO3 code and
# the country name, renamed to `code` and `nationality`.
country_codes_df <- select(
  geodata::country_codes(),
  code = ISO3,
  nationality = NAME
)
# Find the nationalities that have no match in the country lookup, i.e. the
# outdated or misspelled names. The join key is stated explicitly so the join
# does not depend on whichever columns the two tables happen to share.
outdated_country_names <- astronauts_subset %>%
  left_join(country_codes_df, by = "nationality") %>%
  select(nationality, code) %>%
  filter(is.na(code))
# Name corrections, written as "old name" = "new name" pairs so that each
# replacement sits next to the value it replaces and the two vectors below
# cannot drift out of alignment.
country_name_fixes <- c(
  "Czechoslovakia" = "Czech Republic",
  "Hungry" = "Hungary",
  "Korea" = "South Korea",
  "Malysia" = "Malaysia",
  "Netherland" = "Netherlands",
  "Republic of South Africa" = "South Africa",
  "U.K." = "United Kingdom",
  "U.K./U.S." = "United Kingdom",
  "U.S." = "United States",
  "U.S.S.R/Russia" = "Russia",
  "U.S.S.R/Ukraine" = "Ukraine",
  "UAE" = "United Arab Emirates"
)

# Vector of old names (to be replaced)
old_names <- names(country_name_fixes)

# Vector of new names (replacements), in the same order as `old_names`
new_names <- unname(country_name_fixes)
# Replace values of the `nationality` column.
#
# df:        data frame with a `nationality` column.
# old_names: character vector of values to replace.
# new_names: character vector of replacements, paired with `old_names` by
#            position.
# Returns `df` with `nationality` recoded.
update_names_func <- function(df, old_names, new_names) {
  # The two vectors are paired by position, so they must be the same length.
  stopifnot(length(old_names) == length(new_names))

  # recode() takes old = new pairs as named arguments; the named vector is
  # spliced in with !!!.
  replacements <- setNames(new_names, old_names)
  df %>%
    mutate(nationality = recode(nationality, !!!replacements))
}
# Apply the name corrections, then aggregate again: several original labels
# map to the same corrected name (e.g. "U.K." and "U.K./U.S." both become
# "United Kingdom"), which would otherwise leave more than one row for the
# same country.
updated_astronauts_df <- astronauts_subset %>%
  update_names_func(old_names, new_names) %>%
  group_by(nationality) %>%
  summarize(
    count_astronauts = sum(count_astronauts),
    last_mission = min(last_mission)
  )
# Adding the country codes to the data set, so that I can use them with the
# plot_ly function. The join key is stated explicitly.
nationalities_with_code <- left_join(
  updated_astronauts_df,
  country_codes_df,
  by = "nationality"
)

# Plotting an interactive map showing the time passed since the last mission,
# as of the publication of the dataset
# Choropleth trace: countries are located by their code, coloured by
# `last_mission` and labelled with the nationality.
map <- layout(
  plot_ly(
    data = nationalities_with_code,
    type = "choropleth",
    locations = nationalities_with_code$code,
    z = nationalities_with_code$last_mission,
    text = nationalities_with_code$nationality,
    colorscale = "YlOrRd"
  ),
  title = "Astronaut Missions by Country: <br> Years Since Last Mission (as of 15 January 2020)"
)

# Display the interactive map.
map